將檔案做拆分方便維護,例如:
├── constants
│ └── index.js
├── utils
│ ├── file.js
│ └── jobFormat.js
├── index.js
├── 104.js
├── cakeresume.js
├── yourator.js
└── package.json
constants.js 放固定的常數
// Base URL used for each job site, keyed by site name.
// NOTE(review): the key `one04` presumably stands for "104" (spelled out
// rather than written as a numeric key) — confirm with the consumers.
const BASE_URL = {
one04: "https://www.104.com.tw/jobs/search/",
yourator: "https://www.yourator.co/api/v4/jobs",
cakeresume: "https://www.cakeresume.com/jobs",
};
// Search keywords; each keyword is queried against every job site.
const KEYWORD_LIST = ["前端", "Front-end"];
module.exports = { BASE_URL, KEYWORD_LIST };
jobFormat.js 用來放整理資料結構或內容的函式
// utils/jobFormat.js
const { BASE_URL } = require("../constants.js");
/**
 * Normalizes the free-text `salary` field of every job.
 *
 * For each job the salary text is classified as "month", "year" or "other"
 * (month is checked first), then split on `-` / `~` into its bounds. Each bound
 * is converted to a number when one can be found; otherwise the original text
 * of that bound is kept (e.g. "待遇面議").
 *
 * @param {Array<Object>} jobList - Jobs whose `salary` is a human-readable string.
 * @returns {Array<Object>} New job objects with `salary` replaced by an array of
 *   bounds (numbers, or the untouched text when no number is present) and an
 *   added `salaryType` of "month" | "year" | "other".
 */
const convertSalaryFormat = (jobList) => {
  const keyword = {
    year: "年",
    month: "月",
    tenThousand: "萬",
  };
  // Amount written in front of 萬 (ten thousand): optional thousands separators,
  // optional decimals of any length, optional whitespace before 萬.
  // A regex literal is used on purpose: inside a template string `\.` collapses
  // to a bare `.` (any character), which made "3.25萬" parse as 25萬.
  const tenThousandAmountRegex = /\d[\d,]*(?:\.\d+)?(?=\s*萬)/;
  // Plain amount, with or without thousands separators.
  const plainAmountRegex = /(?:\d+,?)+/;

  const stripSeparators = (text) => text.replaceAll(",", "");

  // Converts one side of the salary range to a number, or returns it unchanged
  // when it contains no usable number.
  const parseAmount = (item) => {
    if (item.includes(keyword.tenThousand)) {
      const matched = item.match(tenThousandAmountRegex);
      if (matched) {
        // Round to absorb floating-point noise from the decimal multiplication.
        return Math.round(Number(stripSeparators(matched[0])) * 10000);
      }
      // 萬 without a number in front of it (e.g. "數萬元"): fall through.
    }
    const plain = item.match(plainAmountRegex);
    if (plain) {
      return Number(stripSeparators(plain[0]));
    }
    return item;
  };

  const detectSalaryType = (text) => {
    if (text.includes(keyword.month)) {
      return "month";
    }
    if (text.includes(keyword.year)) {
      return "year";
    }
    return "other";
  };

  return jobList.map((job) => {
    // A job without salary text is treated as an empty string instead of throwing.
    const salaryText =
      typeof job.salary === "string" ? job.salary : String(job.salary ?? "");
    return {
      ...job,
      // Split on - or ~ to get the individual bounds of the range.
      salary: salaryText.split(/-|~/).map(parseAmount),
      salaryType: detectSalaryType(salaryText),
    };
  });
};
module.exports = { convertJobListFromYourator, convertSalaryFormat };
流程:由 Cloud Scheduler 呼叫 Cloud Functions,Functions 抓取資料並整理後存至 Firestore。
index.js 內容,Entry Point: init
// Cloud Functions
const functions = require('@google-cloud/functions-framework');
const { KEYWORD_LIST } = require("./constants");
const { convertSalaryFormat } = require("./utils/jobFormat.js");
const fetch104Job = require("./104.js");
const fetchYouratorJob = require("./yourator.js");
const fetchCakeresumeJob = require("./cakeresume.js");
// Firestore
const { initializeApp, cert } = require('firebase-admin/app');
const { getFirestore, FieldValue } = require('firebase-admin/firestore');
const serviceAccount = require('./serviceAccountKey.json');
// Initialize the Firebase Admin app with the service-account credentials
// loaded from ./serviceAccountKey.json.
initializeApp({
credential: cert(serviceAccount)
});
// Firestore handle used for writing the jobList collection.
const db = getFirestore();
// First page passed to each getAll*Job call.
let startPage = 1;
// Last page (inclusive) getAll104Job is allowed to request.
let endPage = 10;
/**
 * HTTP entry point `init`: fetches the jobs, normalizes their salary, and
 * upserts every job into the `jobList` collection using a key derived from
 * the job URL as the document ID.
 *
 * Responds 200 with the number of fetched jobs once every write has finished,
 * or 500 when fetching or writing fails.
 */
functions.http('init', async (req, res) => {
  try {
    // 1. Fetch the raw job list.
    const result = await fetchData();
    // 2. Normalize the salary format.
    const normalized = convertSalaryFormat(result);
    // 3. Strip non-word characters from the url and use the rest as the key.
    //    Keyed through a Map so a job returned for several keywords is written
    //    once (the last occurrence wins, as with sequential writes).
    const jobsByKey = new Map();
    for (const item of normalized) {
      const key = String(item.url ?? '').replace(/\W/g, '');
      if (key === '') {
        // A document ID must be a non-empty string; skip instead of failing the whole run.
        console.warn('Skipping job without a usable url', item.url);
        continue;
      }
      jobsByKey.set(key, { ...item, key });
    }
    // 4. Write every job, using job.key as the document ID.
    //    The writes are awaited before responding: an async callback inside
    //    forEach is never awaited, so the response used to be sent while the
    //    writes were still pending and their failures went unhandled.
    await Promise.all(
      [...jobsByKey.values()].map((job) =>
        db
          .collection('jobList')
          .doc(job.key)
          .set({ ...job, timestamp: FieldValue.serverTimestamp() })
      )
    );
    res.status(200).send(`Total is ${result.length}`);
  } catch (error) {
    console.error('init failed', error);
    res.status(500).send('Failed to update job list');
  }
});
/**
 * Collects 104 jobs for one keyword, page by page.
 *
 * The first page is always requested; further pages are requested until a
 * page comes back empty or the page number exceeds the module-level endPage.
 *
 * @param {string} keyword - Search keyword.
 * @param {number} startPage - Page number to start from.
 * @returns {Promise<Array>} All jobs gathered from the requested pages.
 */
const getAll104Job = async (keyword, startPage) => {
  const collected = [];
  let current = startPage;
  let keepGoing = true;
  while (keepGoing) {
    const batch = await fetch104Job(keyword, current);
    for (const job of batch) {
      collected.push(job);
    }
    current += 1;
    keepGoing = Boolean(batch.length) && current <= endPage;
  }
  return collected;
};
// Yourator counterpart of getAll104Job; same (keyword, startPage) signature.
// NOTE(review): fetchData spreads the resolved value, so the real
// implementation must resolve to an iterable (presumably an array of jobs).
const getAllYouratorJob = async (keyword, startPage) => {
// Implementation omitted
};
// CakeResume counterpart of getAll104Job; same (keyword, startPage) signature.
// NOTE(review): fetchData spreads the resolved value, so the real
// implementation must resolve to an iterable (presumably an array of jobs).
const getAllCakeresumeJob = async (keyword, startPage) => {
// Implementation omitted
};
/**
 * Runs every keyword in KEYWORD_LIST against 104, Yourator and CakeResume
 * (in that order, one request chain at a time) and concatenates the results.
 *
 * @returns {Promise<Array>} Jobs of all keywords; per keyword the order is
 *   104, then Yourator, then CakeResume.
 */
const fetchData = async () => {
  const collected = [];
  for (const keyword of KEYWORD_LIST) {
    const from104 = await getAll104Job(keyword, startPage);
    const fromYourator = await getAllYouratorJob(keyword, startPage);
    const fromCakeresume = await getAllCakeresumeJob(keyword, startPage);
    for (const batch of [from104, fromYourator, fromCakeresume]) {
      for (const job of batch) {
        collected.push(job);
      }
    }
  }
  return collected;
};
*寫入 Firestore
// Write one job into the "jobList" collection, using job.key as the document
// ID and adding a server-side timestamp field alongside the job's own fields.
db.collection("jobList")
.doc(job.key)
.set({
...job,
timestamp: FieldValue.serverTimestamp(),
});
collection:jobList
docID : 職缺 URL (已移除非文字部分)
field:name、companyName、salary...等,再加上 timestamp (代表此筆資料最新抓取的時間)
建立 Cloud Scheduler 於每日 8:00 觸發 Cloud Functions 執行,可於 Firestore 介面看到寫入的資料